{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['__output__.json', '__results___files', '__notebook__.ipynb', '__results__.html', 'custom.css', 'temp.csv']\n"
     ]
    }
   ],
   "source": [
    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
    "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n",
    "# For example, here's several helpful packages to load in \n",
    "\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import scipy as sp\n",
    "\n",
    "from lightgbm.sklearn import LGBMClassifier\n",
    "from sklearn.model_selection import cross_val_score, train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Input data files are available in the \"../input/\" directory.\n",
    "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n",
    "\n",
    "import os\n",
    "print(os.listdir(\"../input\"))\n",
    "\n",
    "# Any results you write to the current directory are saved as output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>id</th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>site_id</th>\n",
       "      <th>site_domain</th>\n",
       "      <th>site_category</th>\n",
       "      <th>app_id</th>\n",
       "      <th>app_domain</th>\n",
       "      <th>app_category</th>\n",
       "      <th>device_id</th>\n",
       "      <th>device_ip</th>\n",
       "      <th>device_model</th>\n",
       "      <th>device_type</th>\n",
       "      <th>device_conn_type</th>\n",
       "      <th>C14</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "      <th>day</th>\n",
       "      <th>hour_day</th>\n",
       "      <th>click</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1.000009e+18</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>ddd2926e</td>\n",
       "      <td>44956a24</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>15706</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1.000017e+19</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>96809ac8</td>\n",
       "      <td>711ee120</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15704</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>1.000037e+19</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>b3cf8def</td>\n",
       "      <td>8a4875bd</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15704</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1.000064e+19</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>e8275b8f</td>\n",
       "      <td>6332421a</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15706</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1.000068e+19</td>\n",
       "      <td>1005</td>\n",
       "      <td>1</td>\n",
       "      <td>fe8cc448</td>\n",
       "      <td>9166c161</td>\n",
       "      <td>0569f928</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>07d7df22</td>\n",
       "      <td>a99f214a</td>\n",
       "      <td>9644d0bf</td>\n",
       "      <td>779d90c2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>18993</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2161</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>157</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0            id    C1  banner_pos  ...   C21 day hour_day click\n",
       "0           0  1.000009e+18  1005           0  ...    79  21        0     0\n",
       "1           1  1.000017e+19  1005           0  ...    79  21        0     0\n",
       "2           2  1.000037e+19  1005           0  ...    79  21        0     0\n",
       "3           3  1.000064e+19  1005           0  ...    79  21        0     0\n",
       "4           4  1.000068e+19  1005           1  ...   157  21        0     0\n",
       "\n",
       "[5 rows x 26 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data = pd.read_csv('../input/temp.csv')\n",
    "train_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Unnamed: 0', 'id', 'C1', 'banner_pos', 'site_id', 'site_domain',\n",
       "       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',\n",
       "       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',\n",
       "       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'day', 'hour_day',\n",
       "       'click'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C1:7\n",
      "banner_pos:7\n",
      "site_id:3496\n",
      "site_domain:4585\n",
      "site_category:23\n",
      "app_id:5469\n",
      "app_domain:390\n",
      "app_category:33\n",
      "device_id:786741\n",
      "device_ip:2129662\n",
      "device_model:6863\n",
      "device_type:4\n",
      "device_conn_type:4\n",
      "C14:1030\n",
      "C15:8\n",
      "C16:9\n",
      "C17:226\n",
      "C18:4\n",
      "C19:47\n",
      "C20:168\n",
      "C21:42\n",
      "day:3\n",
      "hour_day:24\n"
     ]
    }
   ],
   "source": [
    "cols = ['C1', 'banner_pos', 'site_id', 'site_domain',\n",
    "       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',\n",
    "       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',\n",
    "       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'day', 'hour_day']\n",
    "for col in cols:\n",
    "    print('%s:%d'%(col, train_data[col].value_counts().shape[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1、准备数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_data = train_data[cols]\n",
    "y_data = train_data.click"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>site_id</th>\n",
       "      <th>site_domain</th>\n",
       "      <th>site_category</th>\n",
       "      <th>app_id</th>\n",
       "      <th>app_domain</th>\n",
       "      <th>app_category</th>\n",
       "      <th>device_id</th>\n",
       "      <th>device_ip</th>\n",
       "      <th>device_model</th>\n",
       "      <th>device_type</th>\n",
       "      <th>device_conn_type</th>\n",
       "      <th>C14</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "      <th>day</th>\n",
       "      <th>hour_day</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>442</td>\n",
       "      <td>4358</td>\n",
       "      <td>2</td>\n",
       "      <td>5031</td>\n",
       "      <td>175</td>\n",
       "      <td>0</td>\n",
       "      <td>520971</td>\n",
       "      <td>1845413</td>\n",
       "      <td>1819</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>256</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>442</td>\n",
       "      <td>4358</td>\n",
       "      <td>2</td>\n",
       "      <td>5031</td>\n",
       "      <td>175</td>\n",
       "      <td>0</td>\n",
       "      <td>520971</td>\n",
       "      <td>1252534</td>\n",
       "      <td>3017</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>254</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>63</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>442</td>\n",
       "      <td>4358</td>\n",
       "      <td>2</td>\n",
       "      <td>5031</td>\n",
       "      <td>175</td>\n",
       "      <td>0</td>\n",
       "      <td>520971</td>\n",
       "      <td>1495906</td>\n",
       "      <td>3716</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>254</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>63</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>442</td>\n",
       "      <td>4358</td>\n",
       "      <td>2</td>\n",
       "      <td>5031</td>\n",
       "      <td>175</td>\n",
       "      <td>0</td>\n",
       "      <td>520971</td>\n",
       "      <td>1931641</td>\n",
       "      <td>2640</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>256</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>63</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3460</td>\n",
       "      <td>2624</td>\n",
       "      <td>0</td>\n",
       "      <td>5031</td>\n",
       "      <td>175</td>\n",
       "      <td>0</td>\n",
       "      <td>520971</td>\n",
       "      <td>1250578</td>\n",
       "      <td>3199</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>389</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>96</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>37</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   C1  banner_pos  site_id  site_domain    ...     C20  C21  day  hour_day\n",
       "0   2           0      442         4358    ...       0   20    0         0\n",
       "1   2           0      442         4358    ...      63   20    0         0\n",
       "2   2           0      442         4358    ...      63   20    0         0\n",
       "3   2           0      442         4358    ...      63   20    0         0\n",
       "4   2           1     3460         2624    ...       0   37    0         0\n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for col in cols:\n",
    "    X_data[col] = LabelEncoder().fit_transform(X_data[col])\n",
    "X_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(X_data, y_data, random_state=80, test_size=0.2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2、直接进行LightGBM的简单建模"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py:741: UserWarning: categorical_feature keyword has been found in `params` and will be ignored.\n",
      "Please use categorical_feature argument of the Dataset constructor to pass this parameter.\n",
      "  .format(key))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LGBMClassifier(boosting_type='goss',\n",
       "        categorical_feature=[0, 1, 3, 5, 6, 12, 15, 16, 17, 18, 19, 20],\n",
       "        class_weight=None, colsample_bytree=0.4, importance_type='split',\n",
       "        is_unbalance=True, learning_rate=0.01, max_depth=8,\n",
       "        min_child_samples=40, min_child_weight=0.001, min_split_gain=0.0,\n",
       "        n_estimators=100, n_jobs=4, num_leaves=75, objective='binary',\n",
       "        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,\n",
       "        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "params = {'boosting_type': 'goss',\n",
    "          'objective': 'binary',\n",
    "          'is_unbalance':True,\n",
    "          'categorical_feature': [0,1,3,5,6,12,15,16,17,18,19,20],\n",
    "          'n_jobs': 4,\n",
    "          'learning_rate': 0.01,\n",
    "          #'n_estimators':n_estimators_1,\n",
    "          'num_leaves': 75,\n",
    "          'max_depth': 8,\n",
    "          'min_child_samples':40,\n",
    "          'colsample_bytree': 0.4\n",
    "         }\n",
    "\n",
    "lg = LGBMClassifier(silent=False,  **params)\n",
    "lg.fit(X_train_data, y_train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>columns</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>site_domain</td>\n",
       "      <td>911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>C17</td>\n",
       "      <td>775</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>app_id</td>\n",
       "      <td>697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>site_id</td>\n",
       "      <td>689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>C14</td>\n",
       "      <td>581</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>C20</td>\n",
       "      <td>553</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>device_model</td>\n",
       "      <td>466</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>app_domain</td>\n",
       "      <td>420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>C21</td>\n",
       "      <td>326</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>C19</td>\n",
       "      <td>276</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>site_category</td>\n",
       "      <td>273</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>app_category</td>\n",
       "      <td>240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>hour_day</td>\n",
       "      <td>176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>device_id</td>\n",
       "      <td>164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>device_conn_type</td>\n",
       "      <td>119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>day</td>\n",
       "      <td>116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>C18</td>\n",
       "      <td>116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>C15</td>\n",
       "      <td>110</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>banner_pos</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>C16</td>\n",
       "      <td>92</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>device_type</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>device_ip</td>\n",
       "      <td>56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C1</td>\n",
       "      <td>54</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             columns  importance\n",
       "3        site_domain         911\n",
       "16               C17         775\n",
       "5             app_id         697\n",
       "2            site_id         689\n",
       "13               C14         581\n",
       "19               C20         553\n",
       "10      device_model         466\n",
       "6         app_domain         420\n",
       "20               C21         326\n",
       "18               C19         276\n",
       "4      site_category         273\n",
       "7       app_category         240\n",
       "22          hour_day         176\n",
       "8          device_id         164\n",
       "12  device_conn_type         119\n",
       "21               day         116\n",
       "17               C18         116\n",
       "14               C15         110\n",
       "1         banner_pos         100\n",
       "15               C16          92\n",
       "11       device_type          90\n",
       "9          device_ip          56\n",
       "0                 C1          54"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame({\"columns\":list(X_train_data.columns), \"importance\":list(lg.feature_importances_.T)})\n",
    "df = df.sort_values(by=['importance'],ascending=False)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3、预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pre_proba = lg.predict_proba(X_test_data)\n",
    "one_proba = pd.DataFrame(y_pre_proba)[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "logloss=0.4948348736767893\n"
     ]
    }
   ],
   "source": [
    "def logloss(act, pred):\n",
    "    epsilon = 1e-15\n",
    "    pred = sp.maximum(epsilon, pred)\n",
    "    pred = sp.minimum(1.0-epsilon, pred)\n",
    "    ll = (act*sp.log(pred) + sp.subtract(1.0,act)*sp.log(sp.subtract(1.0,pred))).sum()\n",
    "    ll = ll*(-1.0/len(act))\n",
    "    return ll\n",
    "\n",
    "print('logloss={}'.format(logloss(y_test_data, one_proba)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 后面的工作\n",
    "## 特征工程\n",
    "- 一些可能对广告点击影响较大的变量做些特殊处理，例如日期和时间\n",
    "- 没有具体含义的那些特征考虑应该怎么处理\n",
    "- 特征类别较多的特征进行处理\n",
    "\n",
    "## 模型选择和训练\n",
    "- LightGBM模型的在特征工程进一步处理后做调参优化，但是根据作业中调参的情况，根据变化应该不是很大\n",
    "- 选择Xgboost模型进行建模\n",
    "- 选择WND模型进行建模\n",
    "- 尝试建议的FFM和FTPL模型，但是对这两个模型都不太了解，所以视前面特征处理和GBDT模型建模的进展决定"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
